package com.ma.crawl_book.controller;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.ArrayList;
import java.util.List;

@Component
public class CrawlListBook implements PageProcessor {

    /** Crawler politeness settings: retry failed fetches 5 times, sleep 500 ms between requests. */
    private Site site = Site.me().setRetryTimes(5).setSleepTime(500);

    /**
     * Book-detail links harvested from the most recently processed list page.
     * Instance-level (was static): the previous static field was a side channel
     * between the Spring bean and a throwaway {@code new CrawlListBook()} handed
     * to the Spider, which breaks if two crawls ever run at once.
     */
    private final List<String> urls = new ArrayList<>();

    @Autowired
    CloneBook  cloneBook;

    /**
     * Collects every book link from a category list page.
     * Dots in the URL regex are escaped so "." matches a literal dot, not any
     * character (the old pattern also matched e.g. "wwwX23usXso").
     */
    @Override
    public void process(Page page) {
        System.out.println("url == " + page.getUrl().get());
        if (page.getUrl().regex("http://www\\.23us\\.so/list/3_[0-9]*\\.html").match()) {
            // All book-detail links on the current list page.
            urls.clear();
            urls.addAll(page.getHtml().xpath("//td[@class=\"L\"][1]/a").links().all());
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Crawls the default 19 list pages (pages 1..19 — same range as before). */
    public void startCrawl() {
        startCrawl(19);
    }

    /**
     * Crawls the given number of category list pages and hands each discovered
     * book URL to {@code cloneBook.startCrawl}.
     *
     * @param pages number of list pages to visit, starting from page 1
     */
    public void startCrawl(int pages) {
        for (int i = 1; i <= pages; i++) {
            urls.clear();
            String url = "http://www.23us.so/list/3_" + i + ".html";
            // Reuse this Spring-managed instance so harvested links land in our
            // own "urls" field (previously a fresh, un-wired instance wrote into
            // static state). One seed URL needs one worker thread, not 100.
            Spider.create(this).addUrl(url).thread(1).run();
            for (String bookUrl : urls) {
                cloneBook.startCrawl(bookUrl);
            }
        }
    }
}
